The given data is a dataset of police and subject interaction and actions taken on that moment. The data is pre-processed to make it suitable for analysis. lets load the data and check some entries from it.
# loading necessary packages
library(dplyr)
library(ggplot2)
library(tidyr)
library(plotly)
library(lubridate)
library(leaflet)
library(visdat)
library(corrplot)
# loading the given csv file
police_data <- read.csv("37-00049_UOF-P_2016_prepped.csv")
Also adding columns of months and days and number of types of force used. Changing ‘null’ & ‘Unknown’ values to ‘other’. Adding a new column “ID” to give unique id to each incidents.
library(dplyr)
# as we can see that there are double headings present in the data set ,
# removing the first row to clean the data
police_data = police_data[-1,]
str(police_data)
#police_data = police_data[complete.cases(policing_data),]
police_data <- data.frame(lapply(police_data, function(x) ifelse(is.na(x), "other", x)))
police_data <- data.frame(lapply(police_data, function(x) gsub("NULL", "other", x)))
police_data <- data.frame(lapply(police_data, function(x) gsub("Other", "other", x)))
police_data <- data.frame(lapply(police_data, function(x) gsub("Unknown", "other", x)))
police_data <- police_data %>% mutate(ID = row_number())
# select the columns in the desired order
police_data <- police_data[, c(ncol(police_data), 1:(ncol(police_data)-1))]
police_data <- police_data %>% mutate(INCIDENT_DATE = mdy(INCIDENT_DATE))
police_data <- police_data %>% mutate(OFFICER_HIRE_DATE = mdy(OFFICER_HIRE_DATE))
police_data$force_type_count <- sapply(strsplit(police_data$UOF_NUMBER, ", "), function(x) length(unique(x)))
police_data$month <- month(as.Date(police_data$INCIDENT_DATE))
police_data$day <- day(as.Date(police_data$INCIDENT_DATE))
head(police_data)
Using ggplot 2
officer_race <- aggregate(ID ~ OFFICER_RACE, data =police_data, FUN = length)
# Set color scheme for the plot
colors <- c("#004c6d", "#ffa600", "#000000", "#58508d", "#bc5090", "#ff5a5f")
plot1=ggplot(officer_race, aes(x = reorder(OFFICER_RACE, ID), y = ID, fill = OFFICER_RACE)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = colors) +
labs(title = "Number of Incidents by Officer Race",
x = "Officer Race",
y = "Number of Incidents") +
theme_minimal()
ggplotly(plot1)
Frequency of Races among the police officers can be seen here. Most of the officers are white followed by Hispanic and then Black and other ethnicities.
# Convert the table to a data frame
police_data$OFFICER_YEARS_ON_FORCE<-as.numeric(police_data$OFFICER_YEARS_ON_FORCE)
force_age <- as.data.frame(table(police_data$OFFICER_RACE,cut(police_data$OFFICER_YEARS_ON_FORCE, breaks = seq(0, 40, 5))))
# Rename the columns
colnames(force_age) <- c("OFFICER_RACE", "YEARS_ON_FORCE", "COUNT")
# Create a bar chart of the number of incidents by officer race and years on force
plot2=ggplot(force_age, aes(x = YEARS_ON_FORCE, y = COUNT, fill = OFFICER_RACE)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Number of incidents by officer race and years on force")
ggplotly(plot2)
This graph shows the number of incidents handled by the police officers and their experience in the bracket of 0-5, 5-10, 10-15, 15-20 and so on. The graph shows majority of the Officers have less than 5 years of experience, which are handling most of the cases.
library(knitr)
# Table of incidents by officer race
table_race <- table(police_data$OFFICER_RACE)
kable(table_race, caption = "Number of incidents by officer race",col.names = c("Officer Race", "Number of Incidents"))
| Officer Race | Number of Incidents |
|---|---|
| American Ind | 8 |
| Asian | 55 |
| Black | 341 |
| Hispanic | 482 |
| other | 27 |
| White | 1470 |
# Create a pie chart
pie(table_race, main = "incidents by officer race", labels = paste(names(table_race), ": ", table_race, sep = ""))
# Table of incidents by officer gender
table_gender <- table(police_data$OFFICER_GENDER)
kable(table_gender, caption = "Number of incidents by officer gender",col.names = c("Officer Gender", "Number of Incidents"))
| Officer Gender | Number of Incidents |
|---|---|
| Female | 240 |
| Male | 2143 |
# Table of incidents by subject race
table_subject_race <- table(police_data$SUBJECT_RACE)
kable(table_subject_race, caption = "Number of incidents by subject race",col.names = c("subject Race", "Number of Incidents"))
| subject Race | Number of Incidents |
|---|---|
| American Ind | 1 |
| Asian | 5 |
| Black | 1333 |
| Hispanic | 524 |
| other | 50 |
| White | 470 |
# Create a pie chart
pie(table_subject_race, main = "Incidents by subject race", labels = paste(names(table_subject_race), ": ", table_subject_race, sep = ""))
# Table of incidents by subject gender
table_subject_gender <- table(police_data$SUBJECT_GENDER)
kable(table_subject_gender, caption = "Number of incidents by subject gender",col.names = c("subject gender", "Number of Incidents"))
| subject gender | Number of Incidents |
|---|---|
| Female | 440 |
| Male | 1932 |
| other | 11 |
# Create a data frame with officer ID and race
officer_race <- police_data %>%
distinct(OFFICER_ID, OFFICER_RACE)
# Table of officers with multiple incidents
table_multiple <- table(police_data$OFFICER_ID[duplicated(police_data$OFFICER_ID)])
table_multiple_sorted <- table_multiple[order(-table_multiple)]
# Convert the table to a data frame and add officer race
table_multiple_df <- data.frame(OFFICER_ID = names(table_multiple_sorted),
Count = as.vector(table_multiple_sorted))
table_multiple_df_with_race <- left_join(table_multiple_df, officer_race,
by = "OFFICER_ID")
# Select the top 30 values
table_top_30_officers <- table_multiple_df_with_race %>%
arrange(desc(Count)) %>%
slice(1:30)
# Print the table
kable(table_top_30_officers, caption = "Officers with multiple incidents and their races")
| OFFICER_ID | Count | OFFICER_RACE |
|---|---|---|
| 10724 | 24 | White |
| 10697 | 20 | Hispanic |
| 10710 | 17 | White |
| 10818 | 15 | White |
| 10498 | 11 | White |
| 11015 | 11 | Black |
| 10695 | 10 | White |
| 10760 | 10 | White |
| 9925 | 10 | Black |
| 10351 | 9 | White |
| 8525 | 9 | White |
| 9881 | 9 | White |
| 10115 | 8 | Hispanic |
| 10455 | 8 | White |
| 10598 | 8 | White |
| 10612 | 8 | White |
| 8610 | 8 | White |
| 8759 | 8 | White |
| 8982 | 8 | other |
| 9119 | 8 | Hispanic |
| 9932 | 8 | White |
| 10313 | 7 | White |
| 10565 | 7 | White |
| 10624 | 7 | White |
| 10704 | 7 | White |
| 10767 | 7 | White |
| 10823 | 7 | Asian |
| 10864 | 7 | White |
| 10903 | 7 | White |
| 10969 | 7 | White |
As we can se the Outliers in this graph with officers involved in multiple incidents
boxplot(table_multiple_df_with_race$Count, main = "Box Plot of Number of Incidents Handled by Officers", ylab = "Incidents")
subject_race <- police_data %>%
distinct(SUBJECT_ID, SUBJECT_RACE)
table_multiple <- table(police_data$SUBJECT_ID[duplicated(police_data$SUBJECT_ID)])
table_multiple_sorted <- table_multiple[order(-table_multiple)]
table_multiple_sorted <-table_multiple_sorted[-which(names(table_multiple_sorted) == "0")]
# Convert the table to a data frame and add subject race
table_multiple_df_s <- data.frame(SUBJECT_ID = names(table_multiple_sorted),
Count = as.vector(table_multiple_sorted))
table_multiple_df_with_race_s <- left_join(table_multiple_df_s, subject_race,
by = "SUBJECT_ID")
# Select the top 30 values
table_top_30_subjects_s <- table_multiple_df_with_race_s %>%
arrange(desc(Count)) %>%
slice(1:30)
# Print the table
kable(table_top_30_subjects_s, caption = "Subjects with multiple incidents and their races")
| SUBJECT_ID | Count | SUBJECT_RACE |
|---|---|---|
| 43676 | 8 | Black |
| 44942 | 7 | Black |
| 45855 | 6 | White |
| 47548 | 6 | Black |
| 43966 | 5 | Black |
| 44127 | 5 | Black |
| 44918 | 5 | Black |
| 46671 | 5 | Black |
| 46815 | 5 | Black |
| 26063 | 4 | Black |
| 3087 | 4 | other |
| 43301 | 4 | Black |
| 43438 | 4 | Hispanic |
| 43527 | 4 | White |
| 43555 | 4 | Black |
| 43621 | 4 | Black |
| 43773 | 4 | White |
| 43819 | 4 | Black |
| 45650 | 4 | Black |
| 45867 | 4 | Black |
| 46864 | 4 | White |
| 47619 | 4 | Black |
| 14267 | 3 | Black |
| 17083 | 3 | Black |
| 19604 | 3 | White |
| 26044 | 3 | Black |
| 28803 | 3 | Black |
| 29652 | 3 | Black |
| 32504 | 3 | Black |
| 32881 | 3 | Black |
As we can se the Outliers in this graph with Subjects involved in multiple Incidents
boxplot(table_multiple_df_with_race_s$Count, main = "Box Plot of Number of Incidents Subject is involved", ylab = "Incidents")
## Plotting Some Time series To Analyse the Timeline in which Incidents
Occured
# Create a line chart of the number of incidents by month
ggplot(police_data, aes(x = month, y = ID)) +
geom_smooth()+
scale_x_continuous(breaks = seq(1, 12, by = 1))+
labs(x="Month",y="Incidents",title = "Incidents happened in each month from 1-12")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Create a line chart of the number of incidents by day
ggplot(police_data, aes(x = day, y = ID)) +
geom_smooth()+
scale_x_continuous(breaks = seq(1, 30, by = 1))+
labs(x="Day of a Month",y="Incidents",title = "Incidents happened in each day from 1-30")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Create a data frame with the number of incidents per reporting area
incidents_by_area <- police_data %>%
group_by(REPORTING_AREA) %>%
summarise(Incidents = n())
# Plot a density plot of the distribution of incidents per reporting area
ggplot(incidents_by_area, aes(x = Incidents)) +
geom_density(fill = "blue", alpha = 0.5) +
labs(x = "Number of Incidents", y = "Density", title = "Distribution of Incidents per Reporting Area")
#### This graph shows that some Reporting Areas have very high Incidents
count compared to other. This may imply that certain areas are more
prone to crime or officers are biased in certain areas
# Group by subject race and gender, calculate percentage of incidents in which subject was injured
subject_injured <- police_data %>%
filter(SUBJECT_RACE != "other") %>%
group_by(SUBJECT_RACE, SUBJECT_GENDER) %>%
summarise(percent_injured = mean(SUBJECT_INJURY == "Yes") * 100, .groups = "drop")
# Plot density plot of distribution of percentage of incidents in which subject was injured
ggplot(subject_injured, aes(x = percent_injured)) +
geom_density(fill = "blue", alpha = 0.5) +
labs(x = "Percentage of Incidents in which Subject was Injured", y = "Density",
title = "Distribution of the Percentage of by Race and Gender") +
facet_wrap(~ SUBJECT_RACE, nrow = 2)
#### Graphs for American Indian and Asian are Empty as there are not
much values here to display
# Create the box plot
ggplot(police_data, aes(x = OFFICER_RACE, y = OFFICER_YEARS_ON_FORCE)) +
geom_boxplot(fill = "#69b3a2", color = "#264653", alpha = 0.8) +
ggtitle("Experience of Officer Race") +
xlab("Officer Race") +
ylab("Experience of Officer in years") +
theme_minimal() +
theme(plot.title = element_text(color = "#264653", size = 18, face = "bold"),
axis.title.x = element_text(color = "#2a9d8f", size = 14, face = "bold"),
axis.title.y = element_text(color = "#2a9d8f", size = 14, face = "bold"),
axis.text.x = element_text(color = "#2a9d8f", size = 12),
axis.text.y = element_text(color = "#2a9d8f", size = 12))
## Scatter Plot Showing Location Districts and Number of Incidents
happened in them The graph shows highest Number of incidents in d14 and
D2 and lowest in D1
df_monthly <- aggregate(ID ~ month, data = police_data, FUN = length)
df_LOCATION_DISTRICT <- aggregate(ID ~ LOCATION_DISTRICT, data = police_data, FUN = length)
scterplot_district<-ggplot(df_LOCATION_DISTRICT, aes(x = LOCATION_DISTRICT, y = ID)) +
geom_point() +
labs(x = "LOCATION_DISTRICT", y = "Number of Incidents", title = "LOCATION_DISTRICT Incidents") +
theme_minimal()
ggplotly(scterplot_district)
From the plot, It can be observed that there is a negative correlation between experience of Officer and Forces Used on subject which implies that more experienced officers use less force to tackle the situations.
# separating Numerical colums from the data
numeric_cols <- sapply(police_data, is.numeric)
# creating a dataframe for the numerical columns
police_numeric <- police_data[, numeric_cols]
# applying correlation function
correlation_police <- cor(police_numeric)
#print(correlation_police)
police_corplot <-corrplot(correlation_police, type = "lower", method = "circle")
### Maps to represent the location in which the incidents happened
library(leaflet)
# Create a leaflet map
police_data$LOCATION_LATITUDE <- as.numeric(police_data$LOCATION_LATITUDE)
police_data$LOCATION_LONGITUDE <- as.numeric(police_data$LOCATION_LONGITUDE)
mean(na.omit(police_data$LOCATION_LATITUDE))
## [1] 32.80196
m <- leaflet(police_data) %>%
addTiles() %>%
setView(lat = mean(na.omit(police_data$LOCATION_LATITUDE)), lng = mean(na.omit(police_data$LOCATION_LONGITUDE)), zoom = 12) # set the initial map view
any(!is.numeric(police_data$LOCATION_LONGITUDE))
## [1] FALSE
any(!is.numeric(police_data$LOCATION_LATITUDE))
## [1] FALSE
# Add markers to the map for each incident
m <- m %>% addMarkers(
~police_data$LOCATION_LONGITUDE, ~police_data$LOCATION_LATITUDE, # specify the longitude and latitude columns
popup = paste("<b>Incident Reason:</b>", police_data$INCIDENT_REASON, "<br>",
"<b>Officer Gender:</b>", police_data$OFFICER_GENDER, "<br>",
"<b>Subject Race:</b>", police_data$SUBJECT_RACE) # specify the popup text
)
## Warning in validateCoords(lng, lat, funcName): Data contains 55 rows with either
## missing or invalid lat/lon values and will be ignored
# Add a legend to the map for the INCIDENT_REASON variable
m <- m %>% addLegend(
position = "topright",
title = "Incident Reason",
colors = rainbow(14),
labels = unique(police_data$INCIDENT_REASON)
)
length(unique(police_data$INCIDENT_REASON))
## [1] 14
# Add a layer control to the map to toggle the markers and legend
m <- m %>% addLayersControl(
baseGroups = c("Default", "Grayscale", "Streets", "Outdoors", "Satellite"),
overlayGroups = c("Markers", "Legend"),
options = layersControlOptions(collapsed = FALSE)
)
# Display the map
m
library(shiny)
library(ggplot2)
# Read in the police_data dataset
# Define the UI
ui <- fluidPage(
titlePanel("Police Data Visualization"),
sidebarLayout(
sidebarPanel(
selectInput("race", "Select a race:", choices = unique(police_data$SUBJECT_RACE))
),
mainPanel(
plotOutput("plot")
)
)
)
# Define the server
server <- function(input, output) {
output$plot <- renderPlot({
# Subset the data based on the selected race
subset_data <- subset(police_data, SUBJECT_RACE == input$race)
# creating a boxplot
boxplot(subset_data$force_type_count, main = "distribution of Number of forces applied", ylab = "forces Count")
})
}
# Run the app
shinyApp(ui, server)
We can observe data by analysing various graphical charts present above * The police force in Dallas includes mostly white officers followed by Hispanics and then black people. * Majority of police have officers of all ethnicity have less than five years of experience and which shows there is a very high imbalance in the races of subjects and the races of officers. * Next point most of the officers handles less than five incidents and having more than 10 incidents can be considered as outliers. * Time series graph shows significant be less cases in the month seven and eight than in the month first and second. * Some areas or more prone to crimes than other as indicated by density plots * Overall mean experience of black officers or marginally higher than other ethnicities * Map is plotted to highlight the exact pin point where the incident happened. * An interactive graph is plotted to show the distribution of number of types of forces used by the officers of different ethnicities and experiences.